gcqn_qsmooth <- function(counts, gcGroups, bio){
gcBinNormCounts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts), dimnames=list(rownames(counts),colnames(counts)))
for(ii in 1:nlevels(gcGroups)){
id <- which(gcGroups==levels(gcGroups)[ii])
countBin <- counts[id,]
qs <- qsmooth(countBin, group_factor=bio)
normCountBin <- qs@qsmoothData
normCountBin <- round(normCountBin)
normCountBin[normCountBin<0] <- 0
gcBinNormCounts[id,] <- normCountBin
}
return(gcBinNormCounts)
}
library(grDevices)
palette(c("black", "#DF536B", "#61D04F", "#2297E6", "#28E2E5", "#D03AF5", "#EEC21F", "gray62"))
library(GenomicAlignments)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
library(rafalib)
library(cqn)
## Loading required package: mclust
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## Loading required package: nor1mix
## Loading required package: preprocessCore
## Loading required package: splines
## Loading required package: quantreg
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(Biobase)
library(edgeR)
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(mclust)
library(umap)
source("../../../methods/gcqn_validated.R")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.4
## ✔ tidyr 1.0.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compact() masks XVector::compact()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks GenomicAlignments::first(), S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks GenomicAlignments::last()
## ✖ purrr::map() masks mclust::map()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ purrr::simplify() masks DelayedArray::simplify()
## ✖ dplyr::slice() masks XVector::slice(), IRanges::slice()
library(viridis)
## Loading required package: viridisLite
library(qsmooth)
library(RUVSeq)
## Loading required package: EDASeq
## Loading required package: ShortRead
##
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
##
## id
## The following object is masked from 'package:purrr':
##
## compose
## The following object is masked from 'package:tibble':
##
## view
plotGCHex <- function(gr, counts){
counts2 <- counts
df <- as_tibble(cbind(counts2,gc=mcols(gr)$gc))
df <- gather(df, sample, value, -gc)
ggplot(data=df, aes(x=gc, y=log(value+1)) ) + ylab("log(count + 1)") + xlab("GC content") + geom_hex(bins = 50) + theme_bw() + facet_wrap(~sample, nrow=2)
}
plotRD <- function(rd, celltype, region, col=NULL, ...){
if(is.null(col)) col <- 1:nlevels(region)
plot(rd, pch=as.numeric(celltype)+15, col=col[region],
xlab="Dimension 1", ylab="Dimension 2", ...)
legend("bottomleft", c("neuronal", "non-neuronal"), pch=c(16,17), bty='n')
legend("topleft", levels(region), pch=16, col=col[1:nlevels(region)], bty='n')
}
counts <- as.matrix(read.table("../../../data/fullard2019/boca_raw_count_matrix.tsv", header=TRUE, stringsAsFactors = FALSE))
peaks <- read.table("../../../data/fullard2019/boca_peaks_consensus_no_blacklisted_regions.bed", header=FALSE, stringsAsFactors = FALSE)
colnames(peaks) <- c("chromosome", "start", "end", "name")
peakNames <- peaks$name
sn <- substr(peaks$chromosome, 4, sapply(peaks$chromosome, nchar))
start <- peaks$start
end <- peaks$end
gr <- GRanges(seqnames=sn, ranges = IRanges(start=start, end=end), strand="*")
names(gr) <- peaks$name
# get GC content
ff <- FaFile("~/data/genomes/human/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa")
peakSeqs <- getSeq(x=ff, gr)
gcContentPeaks <- letterFrequency(peakSeqs, "GC",as.prob=TRUE)[,1]
gcGroups <- Hmisc::cut2(gcContentPeaks, g=20)
mcols(gr)$gc <- gcContentPeaks
# design
# the data should consist of 2 cell types (neurons and non-neurons) across 14 distinct brain regions of 5 individuals
colnames(counts) <- gsub(x=colnames(counts),pattern="^X", replacement="")
individual <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 1)))
names(individual) <- colnames(counts)
celltype <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 2)))
names(celltype) <- colnames(counts)
region <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 3)))
names(region) <- colnames(counts)
## they also define groups of regions: https://bendlj01.u.hpc.mssm.edu/multireg/
regionBig <- as.character(region)
regionBig[region %in% c("DLPFC", "OFC", "VLPFC", "ACC", "STC", "ITC", "PMC", "INS")] <- "NCX"
regionBig[region %in% c("NAC", "PUT")] <- "STR"
regionBig <- factor(regionBig)
rawCounts <- counts
library(DESeq2)
dds <- DESeqDataSetFromMatrix(counts,
colData=data.frame(individual, celltype, region),
design=as.formula("~ individual + celltype*region"))
# calculate DESeq2 size factors
dds <- estimateSizeFactors(dds)
# extract the size factors and save them in an object
sf <- sizeFactors(dds)
# divide each row by the vector of size factors
normCountsDESeq2 <- sweep(counts, 2, sizeFactors(dds), "/")
library(edgeR)
d <- DGEList(counts)
d <- calcNormFactors(d)
nf <- d$samples$norm.factors
effLibSize <- colSums(counts) * nf
effLibSize_scaled <- effLibSize / mean(effLibSize)
# divide each row by the vector of size factors
normCountsEdgeR <- sweep(counts, 2, effLibSize_scaled, "/")
normCountsFQ <- FQnorm(counts, type="median")
# run cqn model
cqnObj <- cqn(counts, x=gcContentPeaks, lengths=width(gr))
## Warning: The use of 'sig2' is deprecated; do specify 'sigma' (= sqrt(sig2))
## instead
# calculate normalized counts
normCountsCqn <- 2^(cqnObj$y + cqnObj$offset)
cqnplot(cqnObj, n=1)
cqnplot(cqnObj, n=2)
# run cqn model
cqnObj_noLength <- cqn(counts, x=gcContentPeaks, lengths=1000, lengthMethod="fixed")
## Warning: The use of 'sig2' is deprecated; do specify 'sigma' (= sqrt(sig2))
## instead
# calculate normalized counts
normCountsCqn_noLength <- 2^(cqnObj$y + cqnObj$offset)
library(EDASeq)
wit <- withinLaneNormalization(as.matrix(counts), gcContentPeaks, which="full")
normCountsEDASeq <- betweenLaneNormalization(wit, which="full")
rm(wit)
gcGroups <- Hmisc::cut2(gcContentPeaks, g=50)
normCountsGcqn <- gcqn(counts, gcGroups, "median")
normCountsGcqnSmooth <- gcqn_qsmooth(counts, gcGroups, bio=droplevels(interaction(celltype, region)))
set.seed(44)
pcaPlot <- function(pca, regionBig, celltype){
require(ggplot2)
pctVar <- pca$sdev^2 / sum(pca$sdev^2)
df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
df %>% ggplot(., aes(x=PC1, y=PC2, colour=regionBig, shape=celltype)) +
geom_point(size=3) +
theme_classic() +
xlab(paste0("PC 1 (",round(pctVar[1],3)*100,"%)")) +
ylab(paste0("PC 2 (",round(pctVar[2],3)*100,"%)"))
}
normMethods <- c("Raw", "DESeq2", "EdgeR", "FQ", "EDASeq", "Cqn", "Gcqn", "GcqnSmooth")
titles <- c("None", "DESeq2", "edgeR", "Full quantile", "FQ-FQ", "cqn", "GC-FQ", "smooth GC-FQ")
pcaPlots <- list()
for(mm in 1:length(normMethods)){
curMethod <- normMethods[mm]
if(curMethod == "Raw"){
curCounts <- as.matrix(counts)
} else {
curCounts <- as.matrix(get(paste0("normCounts",curMethod)))
}
pca <- prcomp(t(log1p(curCounts)))
pctVar <- pca$sdev^2 / sum(pca$sdev^2)
pcaPlots[[mm]] <- pcaPlot(pca, regionBig, celltype) + ggtitle(titles[mm])
}
names(pcaPlots) <- normMethods
cowplot::plot_grid(plotlist=pcaPlots)
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_pcaPlots.pdf", width=12, height=12)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.13.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:Biostrings':
##
## nnodes
## The following object is masked from 'package:stats':
##
## cutree
ddRaw <- dist(log1p(t(counts)), method="euclidean")
ddEdgeR <- dist(log1p(t(normCountsEdgeR)), method="euclidean")
ddDESeq2 <- dist(log1p(t(normCountsDESeq2)), method="euclidean")
ddFQ <- dist(log1p(t(normCountsFQ)), method="euclidean")
ddCqn <- dist(log1p(t(normCountsCqn)), method="euclidean")
ddEDASeq <- dist(log1p(t(normCountsEDASeq)), method="euclidean")
ddGCQN <- dist(log1p(t(normCountsGcqn)), method="euclidean")
ddGCQNSmooth <- dist(log1p(t(normCountsGcqnSmooth)), method="euclidean")
plotDD <- function(dist, col, label, main, cex=1/2, ...){
require(dendextend)
hdd <- hclust(dist)
dhdd <- as.dendrogram(hdd)
origLabels <- labels(dhdd)
labels(dhdd) <- label[origLabels]
labels_colors(dhdd) <- col[origLabels]
dhdd <- set(dhdd, "labels_cex", cex)
plot(dhdd, main=main, ...)
}
## cell type effect
colCelltype <- as.numeric(celltype)
names(colCelltype) <- names(celltype)
# pdf("~/Dropbox/research/atacseq/bulk/plots/hclust_celltype_fullard2018.pdf",
# width=7, height=5)
plotDD(ddRaw, col=colCelltype, label=celltype, main="Raw counts: celltype effect")
plotDD(ddEdgeR, col=colCelltype, label=celltype, main="edgeR norm: celltype effect")
plotDD(ddDESeq2, col=colCelltype, label=celltype, main="DESeq2 norm: celltype effect")
plotDD(ddFQ, col=colCelltype, label=celltype, main="FQ norm: celltype effect")
plotDD(ddCqn, col=colCelltype, label=celltype, main="Cqn norm: celltype effect")
plotDD(ddEDASeq, col=colCelltype, label=celltype, main="FQ-FQ norm: celltype effect")
plotDD(ddGCQN, col=colCelltype, label=celltype, main="GC-FQ norm: celltype effect")
plotDD(ddGCQNSmooth, col=colCelltype, label=celltype, main="smooth GC-FQ norm: celltype effect")
# dev.off()
## individual effect
colIndividual <- as.numeric(individual)
names(colIndividual) <- names(individual)
plotDD(ddRaw, col=colIndividual, label=individual, main="Raw counts: individual effect")
plotDD(ddEdgeR, col=colIndividual, label=individual, main="edgeR norm: individual effect")
plotDD(ddDESeq2, col=colIndividual, label=individual, main="DESeq2 norm: individual effect")
plotDD(ddFQ, col=colIndividual, label=individual, main="FQ norm: individual effect")
plotDD(ddCqn, col=colIndividual, label=individual, main="Cqn norm: individual effect")
plotDD(ddEDASeq, col=colIndividual, label=individual, main="EDASeq norm: individual effect")
plotDD(ddGCQN, col=colIndividual, label=individual, main="GC-QN norm: individual effect")
plotDD(ddGCQNSmooth, col=colIndividual, label=individual, main="smooth GC-QN norm: individual effect")
## region effect
colRegion <- as.numeric(region)
names(colRegion) <- names(region)
plotDD(ddRaw, col=colRegion, label=region, main="Raw counts: region effect")
plotDD(ddEdgeR, col=colRegion, label=region, main="edgeR norm: region effect")
plotDD(ddDESeq2, col=colRegion, label=region, main="DESeq2 norm: region effect")
plotDD(ddFQ, col=colRegion, label=region, main="FQ norm: region effect")
plotDD(ddCqn, col=colRegion, label=region, main="Cqn norm: region effect")
plotDD(ddEDASeq, col=colRegion, label=region, main="EDASeq norm: region effect")
plotDD(ddGCQN, col=colRegion, label=region, main="GC-QN norm: region effect")
plotDD(ddGCQNSmooth, col=colRegion, label=region, main="smooth GC-QN norm: region effect")
library(irlba)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## The following object is masked from 'package:S4Vectors':
##
## expand
library(uwot)
##
## Attaching package: 'uwot'
## The following object is masked from 'package:umap':
##
## umap
library(cluster)
library(mclust)
library(ggplot2)
umapDR <- function(counts, nPC=6){
pc <- irlba::irlba(log1p(counts), nv=nPC)
umDR <- uwot::umap(pc$v)
return(umDR)
}
pcs <- c(2, 5, 8, 10)
ariMatCT <- matrix(NA, nrow=8, ncol=length(pcs))
rownames(ariMatCT) <- c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "EDASeq", "GC-QN", "smooth GC-QN")
ariMatRegion <- matrix(NA, nrow=8, ncol=length(pcs))
rownames(ariMatRegion) <- c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "EDASeq", "GC-QN", "smooth GC-QN")
for(pp in 1:length(pcs)){
set.seed(pp)
nPC <- pcs[pp]
umRaw <- irlba::irlba(log1p(counts), nv=nPC)$v
umEdgeR <- irlba::irlba(log1p(normCountsEdgeR), nv=nPC)$v
umDESeq2 <- irlba::irlba(log1p(normCountsDESeq2), nv=nPC)$v
umFQ <- irlba::irlba(log1p(normCountsFQ), nv=nPC)$v
umCqn <- irlba::irlba(log1p(normCountsCqn), nv=nPC)$v
umEDASeq <- irlba::irlba(log1p(normCountsEDASeq), nv=nPC)$v
umGcqn <- irlba::irlba(log1p(normCountsGcqn), nv=nPC)$v
umGcqnSmooth <- irlba::irlba(log1p(normCountsGcqnSmooth), nv=nPC)$v
plotRD(umRaw, celltype, regionBig, main="Raw counts")
plotRD(umEdgeR, celltype, regionBig, main="edgeR counts")
plotRD(umDESeq2, celltype, regionBig, main="DESeq2 counts")
plotRD(umFQ, celltype, regionBig, main="FQ counts")
plotRD(umCqn, celltype, regionBig, main="Cqn counts")
plotRD(umEDASeq, celltype, regionBig, main="EDASeq counts")
plotRD(umGcqn, celltype, regionBig, main="GCQN counts")
plotRD(umGcqnSmooth, celltype, regionBig, main="smooth GCQN counts")
ariMatCT[1,pp] <- mclust::adjustedRandIndex(pam(umRaw, k=2)$clustering, celltype)
ariMatCT[2,pp] <- mclust::adjustedRandIndex(pam(umEdgeR, k=2)$clustering, celltype)
ariMatCT[3,pp] <- mclust::adjustedRandIndex(pam(umDESeq2, k=2)$clustering, celltype)
ariMatCT[4,pp] <- mclust::adjustedRandIndex(pam(umFQ, k=2)$clustering, celltype)
ariMatCT[5,pp] <- mclust::adjustedRandIndex(pam(umCqn, k=2)$clustering, celltype)
ariMatCT[6,pp] <- mclust::adjustedRandIndex(pam(umEDASeq, k=2)$clustering, celltype)
ariMatCT[7,pp] <- mclust::adjustedRandIndex(pam(umGcqn, k=2)$clustering, celltype)
ariMatCT[8,pp] <- mclust::adjustedRandIndex(pam(umGcqnSmooth, k=2)$clustering, celltype)
ariMatRegion[1,pp] <- mclust::adjustedRandIndex(pam(umRaw, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[2,pp] <- mclust::adjustedRandIndex(pam(umEdgeR, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[3,pp] <- mclust::adjustedRandIndex(pam(umDESeq2, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[4,pp] <- mclust::adjustedRandIndex(pam(umFQ, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[5,pp] <- mclust::adjustedRandIndex(pam(umCqn, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[6,pp] <- mclust::adjustedRandIndex(pam(umEDASeq, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[7,pp] <- mclust::adjustedRandIndex(pam(umGcqn, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
ariMatRegion[8,pp] <- mclust::adjustedRandIndex(pam(umGcqnSmooth, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
}
ariPlot <- function(ariMat){
ariMat <- as.data.frame(ariMat)
colnames(ariMat) <- paste0(pcs,"PC")
ariMatLong <- tidyr::gather(ariMat)
ariMatLong$method <- factor(rep(c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "FQ-FQ", "GC-FQ", "smooth GC-FQ"), length(pcs)))
ariMatLong$gc <- ifelse(ariMatLong$method %in% c("Cqn", "FQ-FQ", "GC-FQ", "smooth GC-FQ"), TRUE, FALSE)
ariMatLong$key <- factor(ariMatLong$key, levels=c("2PC", "5PC", "8PC", "10PC"))
ggplot(ariMatLong, aes(x=key, y=value, color=method, shape=gc)) +
geom_jitter(width=.1, height=0, size=3) +
theme_classic() +
xlab("Number of PCs") +
ylab("Adjusted Rand Index") +
scale_shape_discrete(name="GC-aware")
}
# for clustering according to cell type, the decrease in ARI is lower for GC-aware methods
# possibly because they don't let technical GC effects contaminate the PCs.
ariPlot(ariMatCT)
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard2018_ariCelltype.pdf", width = 8)
ariRegion <- ariPlot(ariMatRegion)
ariRegion
barplot(rowMeans(ariMatRegion), las=2)
testEdgeR <- function(counts, design, L, tmm=FALSE, offset=NULL){
d <- DGEList(counts)
if(tmm) d <- calcNormFactors(d)
if(!is.null(offset)) d$offset <- offset
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrt <- glmLRT(fit, contrast=L)
lrt$table$padj <- p.adjust(lrt$table$PValue, "fdr")
return(lrt)
}
covarDf <- data.frame(region, celltype, individual)
testDESeq2 <- function(counts, covarDf, L){
dds <- DESeqDataSetFromMatrix(counts,
colData=covarDf,
design=as.formula("~ region*celltype + individual"))
dds <- estimateSizeFactors(dds)
dds <- DESeq(dds)
res <- results(dds, contrast = L)
return(res)
}
design <- model.matrix(~region*celltype + individual)
L_neur <- matrix(0, nrow=ncol(design), ncol=1)
rownames(L_neur) <- colnames(design)
L_neur[2:14,1] <- 1/14 #all regions of non-neuronal
L_neur["celltypeN",1] <- -1 #main neuronal effect
L_neur[grep(x=rownames(L_neur), pattern="^region.*celltypeN$"),1] <- -c(1/14) #all regions of neuronal
maPlotGC <- function(M, A, gc){
require(ggplot2)
df <- data.frame(M, A, gc)
df %>% ggplot(., aes(x=M, y=A, colour=gc)) +
geom_point(size=1/2, alpha=1/4) +
scale_color_gradientn(colours=wesanderson::wes_palette("Zissou1", n=10, type="continuous"), name="GC content", breaks=quantile(gc, probs=seq(0,1,length=10)), labels=NULL) +
theme_classic() +
theme(legend.position = "none") +
geom_hline(aes(yintercept=0)) +
geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
xlab("Average log counts per million") +
ylab("Log2 fold change")
}
maPlotGC_hex <- function(M, A, gc, ng=20){
require(ggplot2)
df <- data.frame(M, A, gc)
df %>% ggplot(., aes(x=M, y=A)) +
stat_summary_hex(aes(z = gc), fun="mean", show.legend=FALSE, bins = 20) +
theme_classic() +
scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=ng, type="continuous"), name="GC content", labels=NULL) + #, limits=c(0.2,0.775)) +
geom_hline(aes(yintercept=0)) +
geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
xlab("Average log counts per million") +
ylab("Log2 fold change")
}
gcBoxplot <- function(df, title){
ggplot(df) +
aes(x=gc, y=logFC, color=gc) +
geom_violin() +
geom_boxplot(width=0.1) +
scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(df$gc), "continuous")) +
geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
theme_bw() +
ylim(c(-2,2)) +
xlab("GC-content bin") +
ggtitle(title) +
theme(axis.text.x = element_text(size=0),
legend.position = "none",
axis.title = element_text(size=16))
}
normMethods <- c("Raw", "DESeq2", "EdgeR", "FQ", "EDASeq", "Cqn", "Cqn_noLength", "Gcqn", "GcqnSmooth")
# lrtOut <- list()
# for(mm in 4:length(normMethods)){
# curMethod <- normMethods[mm]
# if(curMethod == "Raw"){
# curCounts <- as.matrix(rawCounts)
# lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur)
# next
# } else if (curMethod == "EdgeR"){
# curCounts <- as.matrix(rawCounts)
# lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, tmm=TRUE)
# next
# } else if (curMethod == "DESeq2"){
# curCounts <- as.matrix(rawCounts)
# lrtOut[[mm]] <- testDESeq2(curCounts, covarDf, L_neur)
# next
# } else if (curMethod == "Cqn"){
# curCounts <- as.matrix(rawCounts)
# lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, offset=cqnObj$glm.offset)
# next
# } else if(curMethod == "Cqn_noLength"){
# curCounts <- as.matrix(rawCounts)
# lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, offset=cqnObj_noLength$glm.offset)
# next
# } else if(curMethod == "RUV"){
# curCounts <- as.matrix(rawCounts)
# lrtOut[[mm]] <- testEdgeR(curCounts, designRUV, L_neurRUV)
# } else if(curMethod %in% c("FQ", "EDASeq", "Gcqn", "GcqnSmooth")){
# curCounts <- as.matrix(get(paste0("normCounts",curMethod)))
# lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur)
# }
# }
# saveRDS(lrtOut, file="lrtOut2.rds")
lrtOut <- readRDS("lrtOut2.rds")
titles <- c("Raw", "DESeq2", "edgeR", "FQ", "FQ-FQ", "cqn", "cqn_noLength", "GC-FQ", "smooth GC-FQ")
pList <- c()
for(kk in 1:length(lrtOut)){
if(kk == 2){
pList[[kk]] <- maPlotGC(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks) +
ggtitle(titles[kk])
} else {
pList[[kk]] <- maPlotGC(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks) +
ggtitle(titles[kk])
}
}
cowplot::plot_grid(plotlist=pList)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots.pdf", width=8, height=8)
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots.png", width=8, height=8, dpi=150)
pListHex <- c()
for(kk in 1:length(lrtOut)){
if(kk == 2){
pListHex[[kk]] <- maPlotGC_hex(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks) +
ggtitle(titles[kk]) + ylim(c(-5,5)) + coord_fixed()
} else {
pListHex[[kk]] <- maPlotGC_hex(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks) +
ggtitle(titles[kk]) + ylim(c(-5,5)) + coord_fixed()
}
}
names(pListHex) <- titles
cowplot::plot_grid(plotlist=pListHex)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing missing values (geom_hex).
## Warning: Removed 15 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## Warning: Removed 1 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots_hex.pdf", width=8, height=8)
pListBox <- list()
gcGroups20 <- Hmisc::cut2(gcContentPeaks, g=20)
for(kk in 1:length(lrtOut)){
if(kk == 2){
df <- data.frame(gc=gcGroups20, logFC=lrtOut[[kk]]$log2FoldChange)
pListBox[[kk]] <- gcBoxplot(df, titles[kk])
} else {
df <- data.frame(gc=gcGroups20, logFC=lrtOut[[kk]]$table$logFC)
pListBox[[kk]] <- gcBoxplot(df, titles[kk])
}
}
cowplot::plot_grid(plotlist=pListBox)
## Warning: Removed 13086 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13086 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14262 rows containing non-finite values (stat_ydensity).
## Warning: Removed 14262 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14740 rows containing non-finite values (stat_ydensity).
## Warning: Removed 14740 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14761 rows containing non-finite values (stat_ydensity).
## Warning: Removed 14761 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11456 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11456 rows containing non-finite values (stat_boxplot).
## Warning: Removed 22130 rows containing non-finite values (stat_ydensity).
## Warning: Removed 22130 rows containing non-finite values (stat_boxplot).
## Warning: Removed 17948 rows containing non-finite values (stat_ydensity).
## Warning: Removed 17948 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13351 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13351 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10943 rows containing non-finite values (stat_ydensity).
## Warning: Removed 10943 rows containing non-finite values (stat_boxplot).
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_boxplots.pdf", width=16, height=10)
library(UpSetR)
daPeaks <- list()
for(kk in 1:length(lrtOut)){
if(kk == 2){
daPeaks[[kk]] <- which(lrtOut[[kk]]$padj <= 0.05)
} else {
daPeaks[[kk]] <- which(p.adjust(lrtOut[[kk]]$table$PValue, "fdr") <= 0.05)
}
}
barplot(unlist(lapply(daPeaks, length)),
names=normMethods, ylab="Nr DA peaks")
names(daPeaks) <- titles
# pdf("~/Dropbox/research/atacseq/bulk/plots/fullard_upset.pdf")
upset(fromList(daPeaks), nsets=9, nintersects=10, keep.order=TRUE, order.by="freq")
# dev.off()
M=lrtOut[[1]]$table$logCPM
A=lrtOut[[1]]$table$logFC
gc=gcContentPeaks
df <- data.frame(M, A, gc)
hlp <- df %>% ggplot(., aes(x=M, y=A)) +
stat_summary_hex(aes(z = gc)) +
theme_classic() +
scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=10, type="continuous"), name="GC content", labels=NULL) +
geom_hline(aes(yintercept=0)) +
geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
xlab("Average log counts per million") +
ylab("Log2 fold change")
leg <- ggpubr::get_legend(hlp)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cols <- wesanderson::wes_palette("Zissou1", n=10, type="continuous")
pcaAriPlots <- cowplot::plot_grid(pcaPlots[["EDASeq"]] + ggtitle("") + labs(colour="Region", shape="Cell type") + coord_fixed(),
ariRegion + scale_color_viridis_d(),
nrow=2, ncol=1, labels=c("a", "b"))
maPlots <- cowplot::plot_grid(plotlist=pListHex[c("edgeR", "FQ", "FQ-FQ", "cqn")],
nrow=2, ncol=2)
## Warning: Removed 1 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cowplot::plot_grid(pcaAriPlots,
maPlots,
leg,
labels=c("", "c",""),
nrow=1, ncol=3,
rel_widths = c(1,1.2,.3))
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_mainFigure.pdf", width=13, height=9)
# overlap with known genomic features
# note that hg19 is synonym for GRCh37
library(ChIPpeakAnno)
## Loading required package: grid
## Loading required package: VennDiagram
## Loading required package: futile.logger
##
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:dendextend':
##
## rotate
##
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
library(EnsDb.Hsapiens.v75)
## Loading required package: ensembldb
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
##
## filter
## The following object is masked from 'package:stats':
##
## filter
library(org.Hs.eg.db)
##
data(TSS.human.GRCh37)
peakFeaturesAll <- suppressWarnings(assignChromosomeRegion(gr, nucleotideLevel=FALSE,
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesAll$percentage, las=1) # across all peaks
Enriched GO terms on intersection of peaks make a lot of sense.
intersectDAPeaks <- Reduce(intersect, daPeaks)
peakFeaturesIst <- suppressWarnings(assignChromosomeRegion(gr[intersectDAPeaks],
nucleotideLevel=FALSE,
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
# intersection is enriched in expected biological features
barplot(peakFeaturesIst$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
mcols=mcols(annoData))
istPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[intersectDAPeaks],
AnnotationData=annoData,
output="nearestBiDirectionalPromoters",
bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goRes <- getEnrichedGO(istPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05,
minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResUniq <- lapply(goRes, function(tab){
tab2 <- unique(tab[,1:10])
tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
rownames(tab2) <- NULL
tab2
})
head(goResUniq$bp)
## go.id
## 1 GO:0010243
## 2 GO:0000122
## 3 GO:0035601
## 4 GO:0031329
## 5 GO:0070431
## 6 GO:0098732
## go.term
## 1 response to organonitrogen compound
## 2 negative regulation of transcription by RNA polymerase II
## 3 protein deacylation
## 4 regulation of cellular catabolic process
## 5 nucleotide-binding oligomerization domain containing 2 signaling pathway
## 6 macromolecule deacylation
## Definition
## 1 Any process that results in a change in state or activity of a cell or an organism (in terms of movement, secretion, enzyme production, gene expression, etc.) as a result of an organonitrogen stimulus. An organonitrogen compound is formally a compound containing at least one carbon-nitrogen bond.
## 2 Any process that stops, prevents, or reduces the frequency, rate or extent of transcription mediated by RNA polymerase II.
## 3 The removal of an acyl group, any group or radical of the form RCO- where R is an organic group, from a protein amino acid.
## 4 Any process that modulates the frequency, rate or extent of the chemical reactions and pathways resulting in the breakdown of substances, carried out by individual cells.
## 5 Any series of molecular signals generated as a consequence of binding to nucleotide-binding oligomerization domain containing 2 (NOD2).
## 6 The removal of an acyl group, any group or radical of the form RCO- where R is an organic group, from a macromolecule.
## Ontology pvalue count.InDataset count.InGenome totaltermInDataset
## 1 BP 1.326880e-05 262 998 329316
## 2 BP 1.679815e-05 224 839 329316
## 3 BP 1.897572e-05 40 103 329316
## 4 BP 2.016790e-05 231 871 329316
## 5 BP 2.179805e-05 10 13 329316
## 6 BP 2.470798e-05 40 104 329316
## totaltermInGenome BH.adjusted.p.value
## 1 1593489 0.03653406
## 2 1593489 0.03653406
## 3 1593489 0.03653406
## 4 1593489 0.03653406
## 5 1593489 0.03653406
## 6 1593489 0.03653406
head(goResUniq$mf)
## go.id go.term
## 1 GO:0019899 enzyme binding
## Definition Ontology
## 1 Interacting selectively and non-covalently with any enzyme. MF
## pvalue count.InDataset count.InGenome totaltermInDataset
## 1 1.592692e-05 529 2217 51719
## totaltermInGenome BH.adjusted.p.value
## 1 255581 0.04131444
head(goResUniq$cc)
## go.id go.term
## 1 GO:0017053 transcriptional repressor complex
## 2 GO:0045202 synapse
## 3 GO:0097458 neuron part
## 4 GO:0005654 nucleoplasm
## 5 GO:0030054 cell junction
## 6 GO:0042995 cell projection
## Definition
## 1 A protein complex that possesses activity that prevents or downregulates transcription.
## 2 The junction between a nerve fiber of one neuron and another neuron, muscle fiber or glial cell. As the nerve fiber approaches the synapse it enlarges into a specialized structure, the presynaptic nerve ending, which contains mitochondria and synaptic vesicles. At the tip of the nerve ending is the presynaptic membrane; facing it, and separated from it by a minute cleft (the synaptic cleft) is a specialized area of membrane on the receiving cell, known as the postsynaptic membrane. In response to the arrival of nerve impulses, the presynaptic nerve ending secretes molecules of neurotransmitters into the synaptic cleft. These diffuse across the cleft and transmit the signal to the postsynaptic membrane.
## 3 Any constituent part of a neuron, the basic cellular unit of nervous tissue. A typical neuron consists of a cell body (often called the soma), an axon, and dendrites. Their purpose is to receive, conduct, and transmit impulses in the nervous system.
## 4 That part of the nuclear content other than the chromosomes or the nucleolus.
## 5 A cellular component that forms a specialized region of connection between two or more cells or between a cell and the extracellular matrix. At a cell junction, anchoring proteins extend through the plasma membrane to link cytoskeletal proteins in one cell to cytoskeletal proteins in neighboring cells or to proteins in the extracellular matrix.
## 6 A prolongation or process extending from a cell, e.g. a flagellum or axon.
## Ontology pvalue count.InDataset count.InGenome totaltermInDataset
## 1 CC 7.715988e-08 38 84 90270
## 2 CC 3.531184e-05 284 1171 90270
## 3 CC 7.818478e-05 401 1729 90270
## 4 CC 8.998435e-05 774 3512 90270
## 5 CC 9.418562e-05 308 1298 90270
## 6 CC 2.206383e-04 491 2178 90270
## totaltermInGenome BH.adjusted.p.value
## 1 463063 0.0001124991
## 2 463063 0.0257423287
## 3 463063 0.0274645272
## 4 463063 0.0274645272
## 5 463063 0.0274645272
## 6 463063 0.0497110353
xtable::xtable(head(goResUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun 3 14:53:19 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
## \hline
## & go.id & go.term & BH.adjusted.p.value \\
## \hline
## 1 & GO:0010243 & response to organonitrogen compound & 0.0365 \\
## 2 & GO:0000122 & negative regulation of transcription by RNA polymerase II & 0.0365 \\
## 3 & GO:0035601 & protein deacylation & 0.0365 \\
## 4 & GO:0031329 & regulation of cellular catabolic process & 0.0365 \\
## 5 & GO:0070431 & nucleotide-binding oligomerization domain containing 2 signaling pathway & 0.0365 \\
## 6 & GO:0098732 & macromolecule deacylation & 0.0365 \\
## 7 & GO:0050768 & negative regulation of neurogenesis & 0.0365 \\
## 8 & GO:0051961 & negative regulation of nervous system development & 0.0365 \\
## 9 & GO:0032990 & cell part morphogenesis & 0.0365 \\
## 10 & GO:0022008 & neurogenesis & 0.0398 \\
## 11 & GO:0007399 & nervous system development & 0.0398 \\
## 12 & GO:0045665 & negative regulation of neuron differentiation & 0.0398 \\
## 13 & GO:0120039 & plasma membrane bounded cell projection morphogenesis & 0.0398 \\
## 14 & GO:0006476 & protein deacetylation & 0.0398 \\
## 15 & GO:0048667 & cell morphogenesis involved in neuron differentiation & 0.0398 \\
## 16 & GO:0048709 & oligodendrocyte differentiation & 0.0398 \\
## 17 & GO:0048666 & neuron development & 0.0398 \\
## 18 & GO:0048858 & cell projection morphogenesis & 0.0459 \\
## 19 & GO:0007417 & central nervous system development & 0.0473 \\
## 20 & GO:1901698 & response to nitrogen compound & 0.0473 \\
## \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun 3 14:53:19 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
## \hline
## & go.id & go.term & BH.adjusted.p.value \\
## \hline
## 1 & GO:0017053 & transcriptional repressor complex & 0.0001 \\
## 2 & GO:0045202 & synapse & 0.0257 \\
## 3 & GO:0097458 & neuron part & 0.0275 \\
## 4 & GO:0005654 & nucleoplasm & 0.0275 \\
## 5 & GO:0030054 & cell junction & 0.0275 \\
## 6 & GO:0042995 & cell projection & 0.0497 \\
## 7 & GO:0005829 & cytosol & 0.0497 \\
## \hline
## \end{tabular}
## \end{table}
cqnPeaks <- unique(c(daPeaks$cqn, daPeaks$cqn_noLength))
otherPeaks <- unique(do.call(c, daPeaks[!names(daPeaks) %in% c("cqn", "cqn_noLength")]))
cqnUniquePeaks <- cqnPeaks[!cqnPeaks %in% otherPeaks]
### interesting genomic features are depleted, re-encouraging these could be FP results
peakFeaturesCqn <- suppressWarnings(assignChromosomeRegion(gr[cqnUniquePeaks],
nucleotideLevel=FALSE,
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesCqn$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)
## GO enrichment recovers no gene sets.
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
mcols=mcols(annoData))
cqnPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[cqnUniquePeaks],
AnnotationData=annoData,
output="nearestBiDirectionalPromoters",
bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goResCqn <- getEnrichedGO(cqnPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05,
minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResCqnUniq <- lapply(goResCqn, function(tab){
tab2 <- unique(tab[,1:10])
tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
rownames(tab2) <- NULL
tab2
})
head(goResCqnUniq$bp)
## [1] go.id go.term Definition
## [4] Ontology pvalue count.InDataset
## [7] count.InGenome totaltermInDataset totaltermInGenome
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
head(goResCqnUniq$mf)
## [1] go.id go.term Definition
## [4] Ontology pvalue count.InDataset
## [7] count.InGenome totaltermInDataset totaltermInGenome
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
head(goResCqnUniq$cc)
## [1] go.id go.term Definition
## [4] Ontology pvalue count.InDataset
## [7] count.InGenome totaltermInDataset totaltermInGenome
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
xtable::xtable(head(goResCqnUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun 3 14:54:08 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
## \hline
## & go.id & go.term & BH.adjusted.p.value \\
## \hline
## \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResCqnUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun 3 14:54:08 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
## \hline
## & go.id & go.term & BH.adjusted.p.value \\
## \hline
## \hline
## \end{tabular}
## \end{table}
# fraction annotated
length(cqnPeaksAnnotated) / length(cqnUniquePeaks)
## [1] 0.04378117
gcfqPeaks <- unique(c(daPeaks$`FQ-FQ`, daPeaks$`GC-FQ`))
fqPeaks <- daPeaks$FQ
gcfqUniquePeaks <- gcfqPeaks[!gcfqPeaks %in% fqPeaks]
### genomic features
peakFeaturesGcfq <- suppressWarnings(assignChromosomeRegion(gr[gcfqUniquePeaks],
nucleotideLevel=FALSE,
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesGcfq$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)
## GO enrichment recovers no gene sets.
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
mcols=mcols(annoData))
gcfqPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[gcfqUniquePeaks],
AnnotationData=annoData,
output="nearestBiDirectionalPromoters",
bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goResgcfq <- getEnrichedGO(gcfqPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05,
minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResGcfqUniq <- lapply(goResgcfq, function(tab){
tab2 <- unique(tab[,1:10])
tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
rownames(tab2) <- NULL
tab2
})
head(goResGcfqUniq$bp)
## go.id go.term
## 1 GO:0006813 potassium ion transport
## 2 GO:0032990 cell part morphogenesis
## 3 GO:0120039 plasma membrane bounded cell projection morphogenesis
## 4 GO:0048858 cell projection morphogenesis
## 5 GO:0048812 neuron projection morphogenesis
## 6 GO:0043266 regulation of potassium ion transport
## Definition
## 1 The directed movement of potassium ions (K+) into, out of or within a cell, or between cells, by means of some agent such as a transporter or pore.
## 2 The process in which the anatomical structures of a cell part are generated and organized.
## 3 The process in which the anatomical structures of a plasma membrane bounded cell projection are generated and organized.
## 4 The process in which the anatomical structures of a cell projection are generated and organized.
## 5 The process in which the anatomical structures of a neuron projection are generated and organized. A neuron projection is any process extending from a neural cell, such as axons or dendrites.
## 6 Any process that modulates the frequency, rate or extent of the directed movement of potassium ions (K+) into, out of or within a cell, or between cells, by means of some agent such as a transporter or pore.
## Ontology pvalue count.InDataset count.InGenome totaltermInDataset
## 1 BP 8.204788e-06 48 240 166187
## 2 BP 1.084568e-05 108 685 166187
## 3 BP 1.097839e-05 105 662 166187
## 4 BP 1.421059e-05 105 666 166187
## 5 BP 1.985465e-05 102 648 166187
## 6 BP 2.227999e-05 26 105 166187
## totaltermInGenome BH.adjusted.p.value
## 1 1593489 0.02700587
## 2 1593489 0.02700587
## 3 1593489 0.02700587
## 4 1593489 0.02700587
## 5 1593489 0.02700587
## 6 1593489 0.02700587
head(goResGcfqUniq$mf)
## [1] go.id go.term Definition
## [4] Ontology pvalue count.InDataset
## [7] count.InGenome totaltermInDataset totaltermInGenome
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
head(goResGcfqUniq$cc)
## go.id go.term
## 1 GO:0045202 synapse
## 2 GO:0036477 somatodendritic compartment
## 3 GO:0097458 neuron part
## 4 GO:0044456 synapse part
## 5 GO:0043005 neuron projection
## 6 GO:0030425 dendrite
## Definition
## 1 The junction between a nerve fiber of one neuron and another neuron, muscle fiber or glial cell. As the nerve fiber approaches the synapse it enlarges into a specialized structure, the presynaptic nerve ending, which contains mitochondria and synaptic vesicles. At the tip of the nerve ending is the presynaptic membrane; facing it, and separated from it by a minute cleft (the synaptic cleft) is a specialized area of membrane on the receiving cell, known as the postsynaptic membrane. In response to the arrival of nerve impulses, the presynaptic nerve ending secretes molecules of neurotransmitters into the synaptic cleft. These diffuse across the cleft and transmit the signal to the postsynaptic membrane.
## 2 The region of a neuron that includes the cell body (cell soma) and dendrite(s), but excludes the axon.
## 3 Any constituent part of a neuron, the basic cellular unit of nervous tissue. A typical neuron consists of a cell body (often called the soma), an axon, and dendrites. Their purpose is to receive, conduct, and transmit impulses in the nervous system.
## 4 Any constituent part of a synapse, the junction between a nerve fiber of one neuron and another neuron or muscle fiber or glial cell.
## 5 A prolongation or process extending from a nerve cell, e.g. an axon or dendrite.
## 6 A neuron projection that has a short, tapering, morphology. Dendrites receive and integrate signals from other neurons or from sensory stimuli, and conduct nerve impulses towards the axon or the cell body. In most neurons, the impulse is conveyed from dendrites to axon via the cell body, but in some types of unipolar neuron, the impulse does not travel via the cell body.
## Ontology pvalue count.InDataset count.InGenome totaltermInDataset
## 1 CC 1.022663e-07 185 1171 49927
## 2 CC 5.990199e-06 133 843 49927
## 3 CC 9.847623e-06 244 1729 49927
## 4 CC 1.095598e-05 144 938 49927
## 5 CC 2.843742e-05 189 1312 49927
## 6 CC 3.236236e-05 100 620 49927
## totaltermInGenome BH.adjusted.p.value
## 1 463063 0.0001223105
## 2 463063 0.0032758371
## 3 463063 0.0032758371
## 4 463063 0.0032758371
## 5 463063 0.0048911883
## 6 463063 0.0048911883
xtable::xtable(head(goResGcfqUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun 3 14:55:02 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
## \hline
## & go.id & go.term & BH.adjusted.p.value \\
## \hline
## 1 & GO:0006813 & potassium ion transport & 0.0270 \\
## 2 & GO:0032990 & cell part morphogenesis & 0.0270 \\
## 3 & GO:0120039 & plasma membrane bounded cell projection morphogenesis & 0.0270 \\
## 4 & GO:0048858 & cell projection morphogenesis & 0.0270 \\
## 5 & GO:0048812 & neuron projection morphogenesis & 0.0270 \\
## 6 & GO:0043266 & regulation of potassium ion transport & 0.0270 \\
## 7 & GO:0098655 & cation transmembrane transport & 0.0270 \\
## 8 & GO:0031175 & neuron projection development & 0.0270 \\
## 9 & GO:0048666 & neuron development & 0.0270 \\
## 10 & GO:0030182 & neuron differentiation & 0.0270 \\
## 11 & GO:0120036 & plasma membrane bounded cell projection organization & 0.0320 \\
## 12 & GO:0050808 & synapse organization & 0.0320 \\
## 13 & GO:0099536 & synaptic signaling & 0.0320 \\
## 14 & GO:0099537 & trans-synaptic signaling & 0.0331 \\
## 15 & GO:0071804 & cellular potassium ion transport & 0.0352 \\
## 16 & GO:0071805 & potassium ion transmembrane transport & 0.0352 \\
## 17 & GO:0055085 & transmembrane transport & 0.0408 \\
## 18 & GO:0050804 & modulation of chemical synaptic transmission & 0.0467 \\
## 19 & GO:0030030 & cell projection organization & 0.0467 \\
## 20 & GO:0051641 & cellular localization & 0.0467 \\
## \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResGcfqUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun 3 14:55:02 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
## \hline
## & go.id & go.term & BH.adjusted.p.value \\
## \hline
## 1 & GO:0045202 & synapse & 0.0001 \\
## 2 & GO:0036477 & somatodendritic compartment & 0.0033 \\
## 3 & GO:0097458 & neuron part & 0.0033 \\
## 4 & GO:0044456 & synapse part & 0.0033 \\
## 5 & GO:0043005 & neuron projection & 0.0049 \\
## 6 & GO:0030425 & dendrite & 0.0049 \\
## 7 & GO:0098978 & glutamatergic synapse & 0.0049 \\
## 8 & GO:0030424 & axon & 0.0049 \\
## 9 & GO:0097447 & dendritic tree & 0.0049 \\
## 10 & GO:1904813 & ficolin-1-rich granule lumen & 0.0055 \\
## 11 & GO:0098794 & postsynapse & 0.0087 \\
## 12 & GO:0033267 & axon part & 0.0087 \\
## 13 & GO:0120025 & plasma membrane bounded cell projection & 0.0108 \\
## 14 & GO:0097060 & synaptic membrane & 0.0117 \\
## 15 & GO:0042995 & cell projection & 0.0193 \\
## 16 & GO:0045211 & postsynaptic membrane & 0.0193 \\
## 17 & GO:1990234 & transferase complex & 0.0374 \\
## 18 & GO:1990752 & microtubule end & 0.0405 \\
## 19 & GO:0034703 & cation channel complex & 0.0428 \\
## 20 & GO:0000777 & condensed chromosome kinetochore & 0.0428 \\
## \hline
## \end{tabular}
## \end{table}
# fraction annotated
length(gcfqPeaksAnnotated) / length(gcfqUniquePeaks)
## [1] 0.1387382
# pdf("~/Dropbox/research/atacseq/bulk/plots/fullard_genomicFeatures.pdf", width=9)
par(mfrow=c(3,1))
barplot(peakFeaturesIst$percentage / peakFeaturesAll$percentage,
main="Intersection across normalization methods",
xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
barplot(peakFeaturesGcfq$percentage / peakFeaturesAll$percentage,
main="GC-FQ and FQ-FQ but not FQ",
xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
barplot(peakFeaturesCqn$percentage / peakFeaturesAll$percentage,
main="Peaks only found by cqn",
xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
# dev.off()